Abstract

1-2 paragraph motivating reasons

2-3 paragraph introduction

Methods

libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggtree)
## ggtree v3.12.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## Guangchuang Yu.  Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242
## 
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
## 
## Attaching package: 'ggtree'
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
## You are using rphylopic v.1.4.0. Please remember to credit PhyloPic contributors (hint: `get_attribution()`) and cite rphylopic in your work (hint: `citation("rphylopic")`).
## 
## Attaching package: 'rphylopic'
## 
## The following object is masked from 'package:ggimage':
## 
##     geom_phylopic
library(treeio)
## treeio v1.28.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
## 
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
## 
## Guangchuang Yu.  Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242
library(tidytree)
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
## 
## Attaching package: 'tidytree'
## 
## The following object is masked from 'package:treeio':
## 
##     getNodeNum
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(ape)
## 
## Attaching package: 'ape'
## 
## The following objects are masked from 'package:tidytree':
## 
##     drop.tip, keep.tip
## 
## The following object is masked from 'package:treeio':
## 
##     drop.tip
## 
## The following object is masked from 'package:ggtree':
## 
##     rotate
## 
## The following object is masked from 'package:dplyr':
## 
##     where
library(TreeTools)
## 
## Attaching package: 'TreeTools'
## 
## The following object is masked from 'package:tidytree':
## 
##     MRCA
## 
## The following object is masked from 'package:treeio':
## 
##     MRCA
## 
## The following object is masked from 'package:ggtree':
## 
##     MRCA
library(phytools)
## Loading required package: maps
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
## 
## 
## Attaching package: 'phytools'
## 
## The following object is masked from 'package:TreeTools':
## 
##     as.multiPhylo
## 
## The following object is masked from 'package:treeio':
## 
##     read.newick
library(ggnewscale)
library(ggtreeExtra)
## ggtreeExtra v1.14.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
library(ggstar)

NEON tables

Mag table

NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
  # create a new column with the Assembly Type
  mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
                            TRUE ~ "Individual")) %>%
  mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%  
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
  mutate_at("Domain", na_if,"") %>%
  mutate_at("Phylum", na_if,"") %>%
  mutate_at("Class", na_if,"") %>%
  mutate_at("Order", na_if,"") %>%
  mutate_at("Family", na_if,"") %>%
  mutate_at("Genus", na_if,"") %>%
  mutate_at("Species", na_if,"") %>%
 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
  # Get rid of the the common string "S-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].

Lab 9 Infrastructure

NEON_MAGs_bact_ind <- NEON_MAGs %>% 
  filter(Domain == "Bacteria") %>% 
  filter(`Assembly Type` == "Individual") 
NEON_MAGs_bact_co <- NEON_MAGs %>% 
  filter(Domain == "Bacteria") %>% 
  filter(`Assembly Type` == "Combined") 

Metagenome table

NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
  select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
  rename(`Genome Name` = `Genome Name / Sample Name`) %>%
  filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
  filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>%
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
  # Get rid of the the common string "-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].

NEON chemistry data

NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
  # remove -COMP from genomicsSampleID
  mutate_at("genomicsSampleID", str_replace, "-COMP", "")
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl  (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date  (1): collectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Join data

NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
  left_join(NEON_metagenomes, by = "Sample Name") %>%
  left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
  rename("label" = "Bin ID")
NEON_MAGs1 <- NEON_MAGs%>%
  select(`Sample Name`,`Site ID`, `GTDB-Tk Taxonomy Lineage`)
NEON_metagenomes1 <- NEON_metagenomes%>%
  select(`Sample Name`,`Site ID`, `Ecosystem Subtype`)

Filter data

site data

Site_MAGs <- NEON_MAGs %>%
  filter(`Site ID` == "KONZ")
Site_metagenomes <- NEON_metagenomes %>%
  filter(`Site ID` == "KONZ")
Site_chemistry <- NEON_chemistry %>%
  filter(`siteID` == "KONZ")

full joining data

NEON1 <- NEON_MAGs %>%
  full_join(NEON_metagenomes, by = "Sample Name")
NEON_full <- NEON1 %>%
  full_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID"))
NEON2 <- Site_MAGs %>%
  full_join(Site_metagenomes, by = "Sample Name")
Site_full <- NEON2 %>%
  full_join(Site_chemistry, by = c("Sample Name" = "genomicsSampleID"))

filter for bacteroidota

NEON_Bact <- NEON_full %>%
  filter(Phylum =='Bacteroidota')
NEON_site <- NEON_full %>%
  filter(`siteID` == "KONZ")

# gets no bacteroidota present, so will not use DELETE eventually

Trees data

NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")
NEON_MAGs_metagenomes_chemistry_noblank <- NEON_MAGs_metagenomes_chemistry %>%
rename("AssemblyType" = "Assembly Type") %>%
rename("BinCompleteness" = "Bin Completeness") %>%
rename("BinContamination" = "Bin Contamination") %>%
rename("TotalNumberofBases" = "Total Number of Bases") %>%
rename("EcosystemSubtype" = "Ecosystem Subtype")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
grep("Bacteroidota", node_vector_bac, value = TRUE)
## [1] "'1.0:p__Bacteroidota'"
match(grep("Bacteroidota", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 2507
tree_bac_preorder <- Preorder(tree_bac)
tree_Bacteroidota <- Subtree(tree_bac_preorder, 2507)

Results

Sankey

knitr::include_url("sankey-NEON_MAG_ind_pavian.txt (1).html")
knitr::include_url("sankey-NEON_MAG_co_pavian_bacteroidota.html")
knitr::include_url("sankey-NEON_MAG_site_pavian.txt (1).html")

##Konza

NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Domain) | is.na(Phylum) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = fct_infreq(`Site`))) +
geom_bar() +
coord_flip() +
labs(title = 'Novel Bacteria by Site')

NEON_MAGs_bact_co %>%
filter(is.na(Class) | is.na(Order) | is.na(Domain) | is.na(Phylum) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = fct_infreq(`Site`))) +
geom_bar() +
coord_flip() +
labs(title = 'Novel Bacteria by Site')

NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Site)), fill = Phylum)) +
geom_bar() +
coord_flip() +
labs(title = 'Total MAGs at Each Site')

NEON_MAGs_bact_ind2 <- NEON_MAGs_bact_ind %>%
filter(`Site ID`== "KONZ")
NEON_MAGs_bact_ind2 %>%
ggplot(aes(x = Phylum, fill = Subplot)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
labs(title = 'MAG Count for each Phylum at the KONZA Site')+
coord_flip()

Question 1

Chemical distribution of the site. Konza

Question 2

How does chemistry effect the distribution of the bacteria at Konza

Nitrogen % vs. Organic C %

Bacteroidota only

ggplotly(
ggplot(data= NEON_Bact ,aes(x = organicCPercent, y = nitrogenPercent)) +
geom_point(aes(color= Family))
)

all phylums

ggplotly(
ggplot(data= NEON_full ,aes(x = organicCPercent, y = nitrogenPercent)) +
geom_point(aes(color= Phylum))
)

soil in water ph vs. soil in cacl ph

only bacteroidota:

ggplotly(
ggplot(data= NEON_Bact ,aes(x = soilInWaterpH , y = soilInCaClpH)) +
geom_point(aes(color= Family))
)

all phylums:

ggplotly(
ggplot(data= NEON_full ,aes(x = soilInWaterpH , y = soilInCaClpH)) +
geom_point(aes(color= Phylum))
)

at Konza:

ggplotly(
ggplot(data= Site_full ,aes(x = soilInWaterpH , y = soilInCaClpH)) +
geom_point(aes(color= Phylum))
)

CN ratio vs. family

Question 3

How does temperature effect distribution

only bacteroidota:

ggplotly(
ggplot(data= NEON_Bact ,aes(x = fct_infreq(`Site ID.x`), y = soilTemp)) +
geom_point(aes(color= Family))+
geom_boxplot() +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
)
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

all phylums:

ggplotly(
ggplot(data= NEON_full ,aes(x = fct_infreq(`Site ID.x`), y = soilTemp)) +
geom_point(aes(color= Phylum))+
geom_boxplot() +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
)
## Warning: Removed 625 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Soil temperature vs. ecosystem

Only Bacteroidota:

ggplotly(
ggplot(data=NEON_Bact ,aes(x = soilTemp, y = fct_infreq(`Ecosystem Subtype`))) +
geom_point(aes(color= Family))
)

all phylums:

ggplotly(
ggplot(data=NEON_full ,aes(x = soilTemp, y = fct_infreq(`Ecosystem Subtype`))) +
geom_point(aes(color= Phylum))
)

at Konza:

ggplotly(
ggplot(data = Site_full ,aes(x = soilTemp, y = fct_infreq(`Ecosystem Subtype`))) +
geom_point(aes(color= Phylum))
)

Question 4

What kinds of bacteria thrive in our location

only bacteroidota:

Question 5

What kinds of biomes does are bacteria live in

Only Bacteroidota:

ggplotly(
ggplot(data=NEON_Bact ,aes(x = Family, y = fct_infreq(`Ecosystem Subtype`))) +
geom_point(aes(color= Family))
)

All phylums:

ggplotly(
ggplot(data=NEON_full ,aes(x = Phylum, y = fct_infreq(`Ecosystem Subtype`))) +
geom_point(aes(color= Phylum))
)

CN vs Family

NEON_full %>%
ggplot(aes(x = CNratio, y = Family)) +
labs(title = "New Graph - Josh") +
geom_point(aes(color='Family')) +
coord_flip()
## Warning: Removed 1678 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggtree(tree_Bacteroidota, layout="circular", branch.length="none") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_point2(mapping=aes(color=`Ecosystem Subtype`, size=`Total Number of Bases`)) +
new_scale_fill() +
geom_fruit(
data=NEON_MAGs_metagenomes_chemistry_noblank,
geom=geom_tile,
mapping=aes(y=label, x=1, fill= AssemblyType),
offset=0.08, # The distance between external layers, default is 0.03 times of x range of tree.
pwidth=0.25 # width of the external layer, default is 0.2 times of x range of tree.
)
## ! The following column names/name: Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size  * assembled, Gene Count  * assembled, Scaffold Count  * assembled, Genome MetaBAT Bin Count  * assembled, Genome EukCC Bin Count  * assembled, CRISPR Count  * assembled, GC Count  * assembled, GC  * assembled, Coding Base Count  * assembled, Coding Base Count %  * assembled, CDS Count  * assembled, CDS %  * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH are/is the same to tree data, the tree data column names are : label, y, angle, Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, Bin Completeness, Bin Contamination, Total Number of Bases, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, Assembly Type, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Subtype, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size  * assembled, Gene Count  * assembled, Scaffold Count  * assembled, Genome MetaBAT Bin Count  * assembled, Genome EukCC Bin Count  * assembled, CRISPR Count  * assembled, GC Count  * assembled, GC  * assembled, Coding Base Count  * assembled, Coding Base Count %  * assembled, CDS Count  * assembled, CDS %  * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH.
## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).